{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## baseline2版本，不参与建模的特征 ['os', 'version', 'lan', 'sid']\n",
    "## Score = 88.094\n",
    "## 添加了osv的使用"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>android_id</th>\n",
       "      <th>apptype</th>\n",
       "      <th>carrier</th>\n",
       "      <th>dev_height</th>\n",
       "      <th>dev_ppi</th>\n",
       "      <th>dev_width</th>\n",
       "      <th>label</th>\n",
       "      <th>lan</th>\n",
       "      <th>media_id</th>\n",
       "      <th>...</th>\n",
       "      <th>os</th>\n",
       "      <th>osv</th>\n",
       "      <th>package</th>\n",
       "      <th>sid</th>\n",
       "      <th>timestamp</th>\n",
       "      <th>version</th>\n",
       "      <th>fea_hash</th>\n",
       "      <th>location</th>\n",
       "      <th>fea1_hash</th>\n",
       "      <th>cus_type</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>316361</td>\n",
       "      <td>1199</td>\n",
       "      <td>46000.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>104</td>\n",
       "      <td>...</td>\n",
       "      <td>android</td>\n",
       "      <td>9</td>\n",
       "      <td>18</td>\n",
       "      <td>1438873</td>\n",
       "      <td>1.559893e+12</td>\n",
       "      <td>8</td>\n",
       "      <td>2135019403</td>\n",
       "      <td>0</td>\n",
       "      <td>2329670524</td>\n",
       "      <td>601</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>135939</td>\n",
       "      <td>893</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>19</td>\n",
       "      <td>...</td>\n",
       "      <td>android</td>\n",
       "      <td>8.1</td>\n",
       "      <td>0</td>\n",
       "      <td>1185582</td>\n",
       "      <td>1.559994e+12</td>\n",
       "      <td>4</td>\n",
       "      <td>2782306428</td>\n",
       "      <td>1</td>\n",
       "      <td>2864801071</td>\n",
       "      <td>1000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>399254</td>\n",
       "      <td>821</td>\n",
       "      <td>0.0</td>\n",
       "      <td>760.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>360.0</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>559</td>\n",
       "      <td>...</td>\n",
       "      <td>android</td>\n",
       "      <td>8.1.0</td>\n",
       "      <td>0</td>\n",
       "      <td>1555716</td>\n",
       "      <td>1.559837e+12</td>\n",
       "      <td>0</td>\n",
       "      <td>1392806005</td>\n",
       "      <td>2</td>\n",
       "      <td>628911675</td>\n",
       "      <td>696</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>68983</td>\n",
       "      <td>1004</td>\n",
       "      <td>46000.0</td>\n",
       "      <td>2214.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1080.0</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>129</td>\n",
       "      <td>...</td>\n",
       "      <td>android</td>\n",
       "      <td>8.1.0</td>\n",
       "      <td>0</td>\n",
       "      <td>1093419</td>\n",
       "      <td>1.560042e+12</td>\n",
       "      <td>0</td>\n",
       "      <td>3562553457</td>\n",
       "      <td>3</td>\n",
       "      <td>1283809327</td>\n",
       "      <td>753</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>288999</td>\n",
       "      <td>1076</td>\n",
       "      <td>46000.0</td>\n",
       "      <td>2280.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1080.0</td>\n",
       "      <td>1</td>\n",
       "      <td>zh-CN</td>\n",
       "      <td>64</td>\n",
       "      <td>...</td>\n",
       "      <td>android</td>\n",
       "      <td>8.0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>1400089</td>\n",
       "      <td>1.559867e+12</td>\n",
       "      <td>5</td>\n",
       "      <td>2364522023</td>\n",
       "      <td>4</td>\n",
       "      <td>1510695983</td>\n",
       "      <td>582</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>499995</th>\n",
       "      <td>499995</td>\n",
       "      <td>392477</td>\n",
       "      <td>1028</td>\n",
       "      <td>46000.0</td>\n",
       "      <td>1920.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>1080.0</td>\n",
       "      <td>1</td>\n",
       "      <td>zh-CN</td>\n",
       "      <td>144</td>\n",
       "      <td>...</td>\n",
       "      <td>Android</td>\n",
       "      <td>7.1.2</td>\n",
       "      <td>25</td>\n",
       "      <td>1546078</td>\n",
       "      <td>1.559834e+12</td>\n",
       "      <td>7</td>\n",
       "      <td>861755946</td>\n",
       "      <td>79</td>\n",
       "      <td>140647032</td>\n",
       "      <td>373</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>499996</th>\n",
       "      <td>499996</td>\n",
       "      <td>346134</td>\n",
       "      <td>1001</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1424.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>720.0</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>29</td>\n",
       "      <td>...</td>\n",
       "      <td>android</td>\n",
       "      <td>8.1.0</td>\n",
       "      <td>0</td>\n",
       "      <td>1480612</td>\n",
       "      <td>1.559814e+12</td>\n",
       "      <td>3</td>\n",
       "      <td>1714444511</td>\n",
       "      <td>23</td>\n",
       "      <td>2745131047</td>\n",
       "      <td>525</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>499997</th>\n",
       "      <td>499997</td>\n",
       "      <td>499635</td>\n",
       "      <td>761</td>\n",
       "      <td>46000.0</td>\n",
       "      <td>1280.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>720.0</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>54</td>\n",
       "      <td>...</td>\n",
       "      <td>android</td>\n",
       "      <td>6.0.1</td>\n",
       "      <td>9</td>\n",
       "      <td>1698442</td>\n",
       "      <td>1.559676e+12</td>\n",
       "      <td>0</td>\n",
       "      <td>3843262581</td>\n",
       "      <td>25</td>\n",
       "      <td>1326115882</td>\n",
       "      <td>810</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>499998</th>\n",
       "      <td>499998</td>\n",
       "      <td>239786</td>\n",
       "      <td>917</td>\n",
       "      <td>46001.0</td>\n",
       "      <td>960.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>540.0</td>\n",
       "      <td>0</td>\n",
       "      <td>zh_CN</td>\n",
       "      <td>109</td>\n",
       "      <td>...</td>\n",
       "      <td>android</td>\n",
       "      <td>5.1.1</td>\n",
       "      <td>0</td>\n",
       "      <td>1331155</td>\n",
       "      <td>1.559840e+12</td>\n",
       "      <td>0</td>\n",
       "      <td>1984296118</td>\n",
       "      <td>225</td>\n",
       "      <td>1446741112</td>\n",
       "      <td>772</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>499999</th>\n",
       "      <td>499999</td>\n",
       "      <td>270531</td>\n",
       "      <td>929</td>\n",
       "      <td>46000.0</td>\n",
       "      <td>2040.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>1080.0</td>\n",
       "      <td>1</td>\n",
       "      <td>zh-CN</td>\n",
       "      <td>59</td>\n",
       "      <td>...</td>\n",
       "      <td>Android</td>\n",
       "      <td>8.1.0</td>\n",
       "      <td>78</td>\n",
       "      <td>1373973</td>\n",
       "      <td>1.559922e+12</td>\n",
       "      <td>5</td>\n",
       "      <td>1697301943</td>\n",
       "      <td>49</td>\n",
       "      <td>1915763579</td>\n",
       "      <td>1076</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>500000 rows × 21 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "        Unnamed: 0  android_id  apptype  carrier  dev_height  dev_ppi  \\\n",
       "0                0      316361     1199  46000.0         0.0      0.0   \n",
       "1                1      135939      893      0.0         0.0      0.0   \n",
       "2                2      399254      821      0.0       760.0      0.0   \n",
       "3                3       68983     1004  46000.0      2214.0      0.0   \n",
       "4                4      288999     1076  46000.0      2280.0      0.0   \n",
       "...            ...         ...      ...      ...         ...      ...   \n",
       "499995      499995      392477     1028  46000.0      1920.0      3.0   \n",
       "499996      499996      346134     1001      0.0      1424.0      0.0   \n",
       "499997      499997      499635      761  46000.0      1280.0      0.0   \n",
       "499998      499998      239786      917  46001.0       960.0      0.0   \n",
       "499999      499999      270531      929  46000.0      2040.0      3.0   \n",
       "\n",
       "        dev_width  label    lan  media_id  ...       os    osv package  \\\n",
       "0             0.0      1    NaN       104  ...  android      9      18   \n",
       "1             0.0      1    NaN        19  ...  android    8.1       0   \n",
       "2           360.0      1    NaN       559  ...  android  8.1.0       0   \n",
       "3          1080.0      0    NaN       129  ...  android  8.1.0       0   \n",
       "4          1080.0      1  zh-CN        64  ...  android  8.0.0       0   \n",
       "...           ...    ...    ...       ...  ...      ...    ...     ...   \n",
       "499995     1080.0      1  zh-CN       144  ...  Android  7.1.2      25   \n",
       "499996      720.0      0    NaN        29  ...  android  8.1.0       0   \n",
       "499997      720.0      0    NaN        54  ...  android  6.0.1       9   \n",
       "499998      540.0      0  zh_CN       109  ...  android  5.1.1       0   \n",
       "499999     1080.0      1  zh-CN        59  ...  Android  8.1.0      78   \n",
       "\n",
       "            sid     timestamp  version    fea_hash location   fea1_hash  \\\n",
       "0       1438873  1.559893e+12        8  2135019403        0  2329670524   \n",
       "1       1185582  1.559994e+12        4  2782306428        1  2864801071   \n",
       "2       1555716  1.559837e+12        0  1392806005        2   628911675   \n",
       "3       1093419  1.560042e+12        0  3562553457        3  1283809327   \n",
       "4       1400089  1.559867e+12        5  2364522023        4  1510695983   \n",
       "...         ...           ...      ...         ...      ...         ...   \n",
       "499995  1546078  1.559834e+12        7   861755946       79   140647032   \n",
       "499996  1480612  1.559814e+12        3  1714444511       23  2745131047   \n",
       "499997  1698442  1.559676e+12        0  3843262581       25  1326115882   \n",
       "499998  1331155  1.559840e+12        0  1984296118      225  1446741112   \n",
       "499999  1373973  1.559922e+12        5  1697301943       49  1915763579   \n",
       "\n",
       "        cus_type  \n",
       "0            601  \n",
       "1           1000  \n",
       "2            696  \n",
       "3            753  \n",
       "4            582  \n",
       "...          ...  \n",
       "499995       373  \n",
       "499996       525  \n",
       "499997       810  \n",
       "499998       772  \n",
       "499999      1076  \n",
       "\n",
       "[500000 rows x 21 columns]"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')\n",
    "\n",
    "# 数据加载\n",
    "train = pd.read_csv('./train.csv')\n",
    "test = pd.read_csv('./test.csv')\n",
    "train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>android_id</th>\n",
       "      <th>apptype</th>\n",
       "      <th>carrier</th>\n",
       "      <th>dev_height</th>\n",
       "      <th>dev_ppi</th>\n",
       "      <th>dev_width</th>\n",
       "      <th>label</th>\n",
       "      <th>lan</th>\n",
       "      <th>media_id</th>\n",
       "      <th>ntt</th>\n",
       "      <th>os</th>\n",
       "      <th>osv</th>\n",
       "      <th>package</th>\n",
       "      <th>sid</th>\n",
       "      <th>timestamp</th>\n",
       "      <th>version</th>\n",
       "      <th>fea_hash</th>\n",
       "      <th>location</th>\n",
       "      <th>fea1_hash</th>\n",
       "      <th>cus_type</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>316361</td>\n",
       "      <td>1199</td>\n",
       "      <td>46000.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>104</td>\n",
       "      <td>6.0</td>\n",
       "      <td>android</td>\n",
       "      <td>9</td>\n",
       "      <td>18</td>\n",
       "      <td>1438873</td>\n",
       "      <td>1.559893e+12</td>\n",
       "      <td>8</td>\n",
       "      <td>2135019403</td>\n",
       "      <td>0</td>\n",
       "      <td>2329670524</td>\n",
       "      <td>601</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>135939</td>\n",
       "      <td>893</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>19</td>\n",
       "      <td>6.0</td>\n",
       "      <td>android</td>\n",
       "      <td>8.1</td>\n",
       "      <td>0</td>\n",
       "      <td>1185582</td>\n",
       "      <td>1.559994e+12</td>\n",
       "      <td>4</td>\n",
       "      <td>2782306428</td>\n",
       "      <td>1</td>\n",
       "      <td>2864801071</td>\n",
       "      <td>1000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>399254</td>\n",
       "      <td>821</td>\n",
       "      <td>0.0</td>\n",
       "      <td>760.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>360.0</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>559</td>\n",
       "      <td>0.0</td>\n",
       "      <td>android</td>\n",
       "      <td>8.1.0</td>\n",
       "      <td>0</td>\n",
       "      <td>1555716</td>\n",
       "      <td>1.559837e+12</td>\n",
       "      <td>0</td>\n",
       "      <td>1392806005</td>\n",
       "      <td>2</td>\n",
       "      <td>628911675</td>\n",
       "      <td>696</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>68983</td>\n",
       "      <td>1004</td>\n",
       "      <td>46000.0</td>\n",
       "      <td>2214.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1080.0</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>129</td>\n",
       "      <td>2.0</td>\n",
       "      <td>android</td>\n",
       "      <td>8.1.0</td>\n",
       "      <td>0</td>\n",
       "      <td>1093419</td>\n",
       "      <td>1.560042e+12</td>\n",
       "      <td>0</td>\n",
       "      <td>3562553457</td>\n",
       "      <td>3</td>\n",
       "      <td>1283809327</td>\n",
       "      <td>753</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>288999</td>\n",
       "      <td>1076</td>\n",
       "      <td>46000.0</td>\n",
       "      <td>2280.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1080.0</td>\n",
       "      <td>1</td>\n",
       "      <td>zh-CN</td>\n",
       "      <td>64</td>\n",
       "      <td>2.0</td>\n",
       "      <td>android</td>\n",
       "      <td>8.0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>1400089</td>\n",
       "      <td>1.559867e+12</td>\n",
       "      <td>5</td>\n",
       "      <td>2364522023</td>\n",
       "      <td>4</td>\n",
       "      <td>1510695983</td>\n",
       "      <td>582</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>499995</th>\n",
       "      <td>392477</td>\n",
       "      <td>1028</td>\n",
       "      <td>46000.0</td>\n",
       "      <td>1920.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>1080.0</td>\n",
       "      <td>1</td>\n",
       "      <td>zh-CN</td>\n",
       "      <td>144</td>\n",
       "      <td>6.0</td>\n",
       "      <td>Android</td>\n",
       "      <td>7.1.2</td>\n",
       "      <td>25</td>\n",
       "      <td>1546078</td>\n",
       "      <td>1.559834e+12</td>\n",
       "      <td>7</td>\n",
       "      <td>861755946</td>\n",
       "      <td>79</td>\n",
       "      <td>140647032</td>\n",
       "      <td>373</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>499996</th>\n",
       "      <td>346134</td>\n",
       "      <td>1001</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1424.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>720.0</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>29</td>\n",
       "      <td>2.0</td>\n",
       "      <td>android</td>\n",
       "      <td>8.1.0</td>\n",
       "      <td>0</td>\n",
       "      <td>1480612</td>\n",
       "      <td>1.559814e+12</td>\n",
       "      <td>3</td>\n",
       "      <td>1714444511</td>\n",
       "      <td>23</td>\n",
       "      <td>2745131047</td>\n",
       "      <td>525</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>499997</th>\n",
       "      <td>499635</td>\n",
       "      <td>761</td>\n",
       "      <td>46000.0</td>\n",
       "      <td>1280.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>720.0</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>54</td>\n",
       "      <td>6.0</td>\n",
       "      <td>android</td>\n",
       "      <td>6.0.1</td>\n",
       "      <td>9</td>\n",
       "      <td>1698442</td>\n",
       "      <td>1.559676e+12</td>\n",
       "      <td>0</td>\n",
       "      <td>3843262581</td>\n",
       "      <td>25</td>\n",
       "      <td>1326115882</td>\n",
       "      <td>810</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>499998</th>\n",
       "      <td>239786</td>\n",
       "      <td>917</td>\n",
       "      <td>46001.0</td>\n",
       "      <td>960.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>540.0</td>\n",
       "      <td>0</td>\n",
       "      <td>zh_CN</td>\n",
       "      <td>109</td>\n",
       "      <td>2.0</td>\n",
       "      <td>android</td>\n",
       "      <td>5.1.1</td>\n",
       "      <td>0</td>\n",
       "      <td>1331155</td>\n",
       "      <td>1.559840e+12</td>\n",
       "      <td>0</td>\n",
       "      <td>1984296118</td>\n",
       "      <td>225</td>\n",
       "      <td>1446741112</td>\n",
       "      <td>772</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>499999</th>\n",
       "      <td>270531</td>\n",
       "      <td>929</td>\n",
       "      <td>46000.0</td>\n",
       "      <td>2040.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>1080.0</td>\n",
       "      <td>1</td>\n",
       "      <td>zh-CN</td>\n",
       "      <td>59</td>\n",
       "      <td>2.0</td>\n",
       "      <td>Android</td>\n",
       "      <td>8.1.0</td>\n",
       "      <td>78</td>\n",
       "      <td>1373973</td>\n",
       "      <td>1.559922e+12</td>\n",
       "      <td>5</td>\n",
       "      <td>1697301943</td>\n",
       "      <td>49</td>\n",
       "      <td>1915763579</td>\n",
       "      <td>1076</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>500000 rows × 20 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "        android_id  apptype  carrier  dev_height  dev_ppi  dev_width  label  \\\n",
       "0           316361     1199  46000.0         0.0      0.0        0.0      1   \n",
       "1           135939      893      0.0         0.0      0.0        0.0      1   \n",
       "2           399254      821      0.0       760.0      0.0      360.0      1   \n",
       "3            68983     1004  46000.0      2214.0      0.0     1080.0      0   \n",
       "4           288999     1076  46000.0      2280.0      0.0     1080.0      1   \n",
       "...            ...      ...      ...         ...      ...        ...    ...   \n",
       "499995      392477     1028  46000.0      1920.0      3.0     1080.0      1   \n",
       "499996      346134     1001      0.0      1424.0      0.0      720.0      0   \n",
       "499997      499635      761  46000.0      1280.0      0.0      720.0      0   \n",
       "499998      239786      917  46001.0       960.0      0.0      540.0      0   \n",
       "499999      270531      929  46000.0      2040.0      3.0     1080.0      1   \n",
       "\n",
       "          lan  media_id  ntt       os    osv  package      sid     timestamp  \\\n",
       "0         NaN       104  6.0  android      9       18  1438873  1.559893e+12   \n",
       "1         NaN        19  6.0  android    8.1        0  1185582  1.559994e+12   \n",
       "2         NaN       559  0.0  android  8.1.0        0  1555716  1.559837e+12   \n",
       "3         NaN       129  2.0  android  8.1.0        0  1093419  1.560042e+12   \n",
       "4       zh-CN        64  2.0  android  8.0.0        0  1400089  1.559867e+12   \n",
       "...       ...       ...  ...      ...    ...      ...      ...           ...   \n",
       "499995  zh-CN       144  6.0  Android  7.1.2       25  1546078  1.559834e+12   \n",
       "499996    NaN        29  2.0  android  8.1.0        0  1480612  1.559814e+12   \n",
       "499997    NaN        54  6.0  android  6.0.1        9  1698442  1.559676e+12   \n",
       "499998  zh_CN       109  2.0  android  5.1.1        0  1331155  1.559840e+12   \n",
       "499999  zh-CN        59  2.0  Android  8.1.0       78  1373973  1.559922e+12   \n",
       "\n",
       "       version    fea_hash  location   fea1_hash  cus_type  \n",
       "0            8  2135019403         0  2329670524       601  \n",
       "1            4  2782306428         1  2864801071      1000  \n",
       "2            0  1392806005         2   628911675       696  \n",
       "3            0  3562553457         3  1283809327       753  \n",
       "4            5  2364522023         4  1510695983       582  \n",
       "...        ...         ...       ...         ...       ...  \n",
       "499995       7   861755946        79   140647032       373  \n",
       "499996       3  1714444511        23  2745131047       525  \n",
       "499997       0  3843262581        25  1326115882       810  \n",
       "499998       0  1984296118       225  1446741112       772  \n",
       "499999       5  1697301943        49  1915763579      1076  \n",
       "\n",
       "[500000 rows x 20 columns]"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test = test.iloc[:, 1:]\n",
    "train = train.iloc[:, 1:]\n",
    "train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "lan    183280\n",
       "osv      6561\n",
       "dtype: int64"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#train.info()\n",
    "#train['lan'].value_counts()\n",
    "# Object类型： lan, os, osv, version, fea_hash\n",
    "# 字符串类型 需要转换为数值（labelencoder）\n",
    "object_cols = train.select_dtypes(include='object').columns\n",
    "\n",
    "# 缺失值个数\n",
    "temp = train.isnull().sum()\n",
    "# 有缺失值的字段： lan, osv\n",
    "temp[temp>0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### Object类型： lan, os, osv, version, fea_hash\n",
    "##### 有缺失值的字段： lan, osv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['android_id', 'apptype', 'carrier', 'dev_height', 'dev_ppi', 'dev_width', 'lan', 'media_id', 'ntt', 'os', 'osv', 'package', 'sid', 'timestamp', 'version', 'fea_hash', 'location', 'fea1_hash', 'cus_type']\n"
     ]
    }
   ],
   "source": [
    "# ['os', 'osv', 'lan', 'sid’]\n",
    "features = train.columns.tolist()\n",
    "features.remove('label')\n",
    "print(features)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "android_id 362258\n",
      "apptype 89\n",
      "carrier 5\n",
      "dev_height 798\n",
      "dev_ppi 92\n",
      "dev_width 346\n",
      "lan 21\n",
      "media_id 284\n",
      "ntt 8\n",
      "os 2\n",
      "osv 154\n",
      "package 1950\n",
      "sid 500000\n",
      "timestamp 500000\n",
      "version 22\n",
      "fea_hash 402980\n",
      "location 332\n",
      "fea1_hash 4959\n",
      "cus_type 58\n"
     ]
    }
   ],
   "source": [
    "for feature in features:\n",
    "    print(feature, train[feature].nunique())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "10    378925\n",
       "9     108904\n",
       "8      11235\n",
       "7        740\n",
       "6         93\n",
       "38        37\n",
       "39        28\n",
       "37        16\n",
       "5         11\n",
       "36         3\n",
       "33         2\n",
       "32         2\n",
       "1          2\n",
       "31         1\n",
       "30         1\n",
       "Name: fea_hash, dtype: int64"
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Thinking: fea_hash是否要做特征变换？\n",
    "#train['fea_hash'].value_counts()\n",
    "#train['fea_hash'].describe()\n",
    "train['fea_hash'].map(lambda x: len(str(x))).value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "10    391669\n",
       "9      99347\n",
       "8       8977\n",
       "7          6\n",
       "5          1\n",
       "Name: fea1_hash, dtype: int64"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#train['fea1_hash'].value_counts()\n",
    "train['fea1_hash'].map(lambda x: len(str(x))).value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 处理osv\n",
    "def trans_osv(osv):\n",
    "    global result\n",
    "    osv = str(osv).replace(' ','').replace('.','').replace('Android_','').replace('十核20G_HD','').replace('Android','').replace('W','')\n",
    "    if osv == 'nan' or osv == 'GIONEE_YNGA':\n",
    "        result = 810\n",
    "    elif osv.count('-') >0:\n",
    "        result = int(osv.split('-')[0])\n",
    "    elif osv == 'f073b_changxiang_v01_b1b8_20180915':\n",
    "        result = 810\n",
    "    elif osv == '%E6%B1%9F%E7%81%B5OS+50':\n",
    "        result = 500\n",
    "    else:\n",
    "        result = int(osv)\n",
    "        \n",
    "    if result < 10:\n",
    "        result = result * 100\n",
    "    elif  result < 100:\n",
    "        result = result * 10\n",
    "        \n",
    "    return int(result)\n",
    "\n",
    "# train['osv_trans'] = train['osv'].apply(trans_osv)\n",
    "# train['osv_trans']\n",
    "# train['osv'].describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['android_id',\n",
       " 'apptype',\n",
       " 'carrier',\n",
       " 'dev_height',\n",
       " 'dev_ppi',\n",
       " 'dev_width',\n",
       " 'media_id',\n",
       " 'ntt',\n",
       " 'osv',\n",
       " 'package',\n",
       " 'timestamp',\n",
       " 'version',\n",
       " 'fea_hash',\n",
       " 'location',\n",
       " 'fea1_hash',\n",
       " 'cus_type']"
      ]
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Columns that never take part in modelling\n",
    "remove_list = ['os', 'lan', 'sid']\n",
    "# Build a new list instead of aliasing `features` and calling .remove() on it,\n",
    "# so this cell does not mutate its input and can be re-run without a ValueError.\n",
    "# NOTE(review): assumes `features` is the list of column names built earlier - confirm.\n",
    "col = [name for name in features if name not in remove_list]\n",
    "col"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>android_id</th>\n",
       "      <th>apptype</th>\n",
       "      <th>carrier</th>\n",
       "      <th>dev_height</th>\n",
       "      <th>dev_ppi</th>\n",
       "      <th>dev_width</th>\n",
       "      <th>media_id</th>\n",
       "      <th>ntt</th>\n",
       "      <th>osv</th>\n",
       "      <th>package</th>\n",
       "      <th>timestamp</th>\n",
       "      <th>version</th>\n",
       "      <th>fea_hash</th>\n",
       "      <th>location</th>\n",
       "      <th>fea1_hash</th>\n",
       "      <th>cus_type</th>\n",
       "      <th>fea_hash_len</th>\n",
       "      <th>fea1_hash_len</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>316361</td>\n",
       "      <td>1199</td>\n",
       "      <td>46000.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>104</td>\n",
       "      <td>6.0</td>\n",
       "      <td>900</td>\n",
       "      <td>18</td>\n",
       "      <td>1.559893e+12</td>\n",
       "      <td>8</td>\n",
       "      <td>2135019403</td>\n",
       "      <td>0</td>\n",
       "      <td>2329670524</td>\n",
       "      <td>601</td>\n",
       "      <td>10</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>135939</td>\n",
       "      <td>893</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>19</td>\n",
       "      <td>6.0</td>\n",
       "      <td>810</td>\n",
       "      <td>0</td>\n",
       "      <td>1.559994e+12</td>\n",
       "      <td>4</td>\n",
       "      <td>2782306428</td>\n",
       "      <td>1</td>\n",
       "      <td>2864801071</td>\n",
       "      <td>1000</td>\n",
       "      <td>10</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>399254</td>\n",
       "      <td>821</td>\n",
       "      <td>0.0</td>\n",
       "      <td>760.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>360.0</td>\n",
       "      <td>559</td>\n",
       "      <td>0.0</td>\n",
       "      <td>810</td>\n",
       "      <td>0</td>\n",
       "      <td>1.559837e+12</td>\n",
       "      <td>0</td>\n",
       "      <td>1392806005</td>\n",
       "      <td>2</td>\n",
       "      <td>628911675</td>\n",
       "      <td>696</td>\n",
       "      <td>10</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>68983</td>\n",
       "      <td>1004</td>\n",
       "      <td>46000.0</td>\n",
       "      <td>2214.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1080.0</td>\n",
       "      <td>129</td>\n",
       "      <td>2.0</td>\n",
       "      <td>810</td>\n",
       "      <td>0</td>\n",
       "      <td>1.560042e+12</td>\n",
       "      <td>0</td>\n",
       "      <td>3562553457</td>\n",
       "      <td>3</td>\n",
       "      <td>1283809327</td>\n",
       "      <td>753</td>\n",
       "      <td>10</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>288999</td>\n",
       "      <td>1076</td>\n",
       "      <td>46000.0</td>\n",
       "      <td>2280.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1080.0</td>\n",
       "      <td>64</td>\n",
       "      <td>2.0</td>\n",
       "      <td>800</td>\n",
       "      <td>0</td>\n",
       "      <td>1.559867e+12</td>\n",
       "      <td>5</td>\n",
       "      <td>2364522023</td>\n",
       "      <td>4</td>\n",
       "      <td>1510695983</td>\n",
       "      <td>582</td>\n",
       "      <td>10</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>499995</th>\n",
       "      <td>392477</td>\n",
       "      <td>1028</td>\n",
       "      <td>46000.0</td>\n",
       "      <td>1920.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>1080.0</td>\n",
       "      <td>144</td>\n",
       "      <td>6.0</td>\n",
       "      <td>712</td>\n",
       "      <td>25</td>\n",
       "      <td>1.559834e+12</td>\n",
       "      <td>7</td>\n",
       "      <td>861755946</td>\n",
       "      <td>79</td>\n",
       "      <td>140647032</td>\n",
       "      <td>373</td>\n",
       "      <td>9</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>499996</th>\n",
       "      <td>346134</td>\n",
       "      <td>1001</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1424.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>720.0</td>\n",
       "      <td>29</td>\n",
       "      <td>2.0</td>\n",
       "      <td>810</td>\n",
       "      <td>0</td>\n",
       "      <td>1.559814e+12</td>\n",
       "      <td>3</td>\n",
       "      <td>1714444511</td>\n",
       "      <td>23</td>\n",
       "      <td>2745131047</td>\n",
       "      <td>525</td>\n",
       "      <td>10</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>499997</th>\n",
       "      <td>499635</td>\n",
       "      <td>761</td>\n",
       "      <td>46000.0</td>\n",
       "      <td>1280.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>720.0</td>\n",
       "      <td>54</td>\n",
       "      <td>6.0</td>\n",
       "      <td>601</td>\n",
       "      <td>9</td>\n",
       "      <td>1.559676e+12</td>\n",
       "      <td>0</td>\n",
       "      <td>3843262581</td>\n",
       "      <td>25</td>\n",
       "      <td>1326115882</td>\n",
       "      <td>810</td>\n",
       "      <td>10</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>499998</th>\n",
       "      <td>239786</td>\n",
       "      <td>917</td>\n",
       "      <td>46001.0</td>\n",
       "      <td>960.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>540.0</td>\n",
       "      <td>109</td>\n",
       "      <td>2.0</td>\n",
       "      <td>511</td>\n",
       "      <td>0</td>\n",
       "      <td>1.559840e+12</td>\n",
       "      <td>0</td>\n",
       "      <td>1984296118</td>\n",
       "      <td>225</td>\n",
       "      <td>1446741112</td>\n",
       "      <td>772</td>\n",
       "      <td>10</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>499999</th>\n",
       "      <td>270531</td>\n",
       "      <td>929</td>\n",
       "      <td>46000.0</td>\n",
       "      <td>2040.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>1080.0</td>\n",
       "      <td>59</td>\n",
       "      <td>2.0</td>\n",
       "      <td>810</td>\n",
       "      <td>78</td>\n",
       "      <td>1.559922e+12</td>\n",
       "      <td>5</td>\n",
       "      <td>1697301943</td>\n",
       "      <td>49</td>\n",
       "      <td>1915763579</td>\n",
       "      <td>1076</td>\n",
       "      <td>10</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>500000 rows × 18 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "        android_id  apptype  carrier  dev_height  dev_ppi  dev_width  \\\n",
       "0           316361     1199  46000.0         0.0      0.0        0.0   \n",
       "1           135939      893      0.0         0.0      0.0        0.0   \n",
       "2           399254      821      0.0       760.0      0.0      360.0   \n",
       "3            68983     1004  46000.0      2214.0      0.0     1080.0   \n",
       "4           288999     1076  46000.0      2280.0      0.0     1080.0   \n",
       "...            ...      ...      ...         ...      ...        ...   \n",
       "499995      392477     1028  46000.0      1920.0      3.0     1080.0   \n",
       "499996      346134     1001      0.0      1424.0      0.0      720.0   \n",
       "499997      499635      761  46000.0      1280.0      0.0      720.0   \n",
       "499998      239786      917  46001.0       960.0      0.0      540.0   \n",
       "499999      270531      929  46000.0      2040.0      3.0     1080.0   \n",
       "\n",
       "        media_id  ntt  osv  package     timestamp version    fea_hash  \\\n",
       "0            104  6.0  900       18  1.559893e+12       8  2135019403   \n",
       "1             19  6.0  810        0  1.559994e+12       4  2782306428   \n",
       "2            559  0.0  810        0  1.559837e+12       0  1392806005   \n",
       "3            129  2.0  810        0  1.560042e+12       0  3562553457   \n",
       "4             64  2.0  800        0  1.559867e+12       5  2364522023   \n",
       "...          ...  ...  ...      ...           ...     ...         ...   \n",
       "499995       144  6.0  712       25  1.559834e+12       7   861755946   \n",
       "499996        29  2.0  810        0  1.559814e+12       3  1714444511   \n",
       "499997        54  6.0  601        9  1.559676e+12       0  3843262581   \n",
       "499998       109  2.0  511        0  1.559840e+12       0  1984296118   \n",
       "499999        59  2.0  810       78  1.559922e+12       5  1697301943   \n",
       "\n",
       "        location   fea1_hash  cus_type  fea_hash_len  fea1_hash_len  \n",
       "0              0  2329670524       601            10             10  \n",
       "1              1  2864801071      1000            10             10  \n",
       "2              2   628911675       696            10              9  \n",
       "3              3  1283809327       753            10             10  \n",
       "4              4  1510695983       582            10             10  \n",
       "...          ...         ...       ...           ...            ...  \n",
       "499995        79   140647032       373             9              9  \n",
       "499996        23  2745131047       525            10             10  \n",
       "499997        25  1326115882       810            10             10  \n",
       "499998       225  1446741112       772            10             10  \n",
       "499999        49  1915763579      1076            10             10  \n",
       "\n",
       "[500000 rows x 18 columns]"
      ]
     },
     "execution_count": 48,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Feature selection. Take an explicit copy so the column assignments below\n",
    "# write to an independent frame rather than to a slice of `train`\n",
    "# (this avoids pandas' SettingWithCopyWarning).\n",
    "features = train[col].copy()\n",
    "\n",
    "for hash_col in ('fea_hash', 'fea1_hash'):\n",
    "    # String length of each hash value, kept as its own feature\n",
    "    features[hash_col + '_len'] = features[hash_col].map(lambda x: len(str(x)))\n",
    "    # Values longer than 16 characters are replaced by 0; the rest are cast to int.\n",
    "    # NOTE(review): the original author left the reason for this cut-off as an open question.\n",
    "    features[hash_col] = features[hash_col].map(lambda x: 0 if len(str(x)) > 16 else int(x))\n",
    "\n",
    "# Normalise OS version strings to integer codes\n",
    "features['osv'] = features['osv'].apply(trans_osv)\n",
    "\n",
    "features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>android_id</th>\n",
       "      <th>apptype</th>\n",
       "      <th>carrier</th>\n",
       "      <th>dev_height</th>\n",
       "      <th>dev_ppi</th>\n",
       "      <th>dev_width</th>\n",
       "      <th>media_id</th>\n",
       "      <th>ntt</th>\n",
       "      <th>osv</th>\n",
       "      <th>package</th>\n",
       "      <th>timestamp</th>\n",
       "      <th>version</th>\n",
       "      <th>fea_hash</th>\n",
       "      <th>location</th>\n",
       "      <th>fea1_hash</th>\n",
       "      <th>cus_type</th>\n",
       "      <th>fea_hash_len</th>\n",
       "      <th>fea1_hash_len</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>317625</td>\n",
       "      <td>1181</td>\n",
       "      <td>46000.0</td>\n",
       "      <td>2196.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>1080.0</td>\n",
       "      <td>639</td>\n",
       "      <td>2.0</td>\n",
       "      <td>810</td>\n",
       "      <td>188</td>\n",
       "      <td>1.559872e+12</td>\n",
       "      <td>7</td>\n",
       "      <td>1672223856</td>\n",
       "      <td>57</td>\n",
       "      <td>3872258917</td>\n",
       "      <td>658</td>\n",
       "      <td>10</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>435108</td>\n",
       "      <td>944</td>\n",
       "      <td>46003.0</td>\n",
       "      <td>2280.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>1080.0</td>\n",
       "      <td>704</td>\n",
       "      <td>6.0</td>\n",
       "      <td>810</td>\n",
       "      <td>221</td>\n",
       "      <td>1.559739e+12</td>\n",
       "      <td>3</td>\n",
       "      <td>3767901757</td>\n",
       "      <td>23</td>\n",
       "      <td>129322164</td>\n",
       "      <td>943</td>\n",
       "      <td>10</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>1106</td>\n",
       "      <td>46000.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>39</td>\n",
       "      <td>2.0</td>\n",
       "      <td>510</td>\n",
       "      <td>1562</td>\n",
       "      <td>1.559614e+12</td>\n",
       "      <td>0</td>\n",
       "      <td>454638703</td>\n",
       "      <td>30</td>\n",
       "      <td>4226678391</td>\n",
       "      <td>411</td>\n",
       "      <td>9</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>451504</td>\n",
       "      <td>761</td>\n",
       "      <td>46000.0</td>\n",
       "      <td>1344.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>720.0</td>\n",
       "      <td>54</td>\n",
       "      <td>2.0</td>\n",
       "      <td>711</td>\n",
       "      <td>9</td>\n",
       "      <td>1.559668e+12</td>\n",
       "      <td>0</td>\n",
       "      <td>1507622951</td>\n",
       "      <td>65</td>\n",
       "      <td>3355419572</td>\n",
       "      <td>848</td>\n",
       "      <td>10</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>1001</td>\n",
       "      <td>46000.0</td>\n",
       "      <td>665.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>320.0</td>\n",
       "      <td>29</td>\n",
       "      <td>5.0</td>\n",
       "      <td>810</td>\n",
       "      <td>4</td>\n",
       "      <td>1.559694e+12</td>\n",
       "      <td>0</td>\n",
       "      <td>4116351093</td>\n",
       "      <td>148</td>\n",
       "      <td>2644467751</td>\n",
       "      <td>411</td>\n",
       "      <td>10</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>149995</th>\n",
       "      <td>0</td>\n",
       "      <td>1001</td>\n",
       "      <td>46000.0</td>\n",
       "      <td>760.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>360.0</td>\n",
       "      <td>29</td>\n",
       "      <td>2.0</td>\n",
       "      <td>810</td>\n",
       "      <td>4</td>\n",
       "      <td>1.559957e+12</td>\n",
       "      <td>0</td>\n",
       "      <td>3162887451</td>\n",
       "      <td>126</td>\n",
       "      <td>2711576615</td>\n",
       "      <td>411</td>\n",
       "      <td>10</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>149996</th>\n",
       "      <td>0</td>\n",
       "      <td>1001</td>\n",
       "      <td>46000.0</td>\n",
       "      <td>780.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>360.0</td>\n",
       "      <td>29</td>\n",
       "      <td>2.0</td>\n",
       "      <td>900</td>\n",
       "      <td>4</td>\n",
       "      <td>1.559863e+12</td>\n",
       "      <td>0</td>\n",
       "      <td>97238959</td>\n",
       "      <td>322</td>\n",
       "      <td>2678022183</td>\n",
       "      <td>411</td>\n",
       "      <td>8</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>149997</th>\n",
       "      <td>0</td>\n",
       "      <td>1001</td>\n",
       "      <td>46000.0</td>\n",
       "      <td>780.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>360.0</td>\n",
       "      <td>29</td>\n",
       "      <td>5.0</td>\n",
       "      <td>810</td>\n",
       "      <td>4</td>\n",
       "      <td>1.560041e+12</td>\n",
       "      <td>0</td>\n",
       "      <td>1320118495</td>\n",
       "      <td>46</td>\n",
       "      <td>2610913319</td>\n",
       "      <td>411</td>\n",
       "      <td>10</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>149998</th>\n",
       "      <td>500925</td>\n",
       "      <td>1052</td>\n",
       "      <td>46000.0</td>\n",
       "      <td>854.0</td>\n",
       "      <td>240.0</td>\n",
       "      <td>480.0</td>\n",
       "      <td>249</td>\n",
       "      <td>6.0</td>\n",
       "      <td>442</td>\n",
       "      <td>0</td>\n",
       "      <td>1.559688e+12</td>\n",
       "      <td>2</td>\n",
       "      <td>1292986591</td>\n",
       "      <td>41</td>\n",
       "      <td>1898209327</td>\n",
       "      <td>430</td>\n",
       "      <td>10</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>149999</th>\n",
       "      <td>0</td>\n",
       "      <td>1001</td>\n",
       "      <td>46000.0</td>\n",
       "      <td>780.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>360.0</td>\n",
       "      <td>29</td>\n",
       "      <td>2.0</td>\n",
       "      <td>900</td>\n",
       "      <td>4</td>\n",
       "      <td>1.559950e+12</td>\n",
       "      <td>0</td>\n",
       "      <td>259614175</td>\n",
       "      <td>122</td>\n",
       "      <td>2594136103</td>\n",
       "      <td>411</td>\n",
       "      <td>9</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>150000 rows × 18 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "        android_id  apptype  carrier  dev_height  dev_ppi  dev_width  \\\n",
       "0           317625     1181  46000.0      2196.0      2.0     1080.0   \n",
       "1           435108      944  46003.0      2280.0      3.0     1080.0   \n",
       "2                0     1106  46000.0         0.0      0.0        0.0   \n",
       "3           451504      761  46000.0      1344.0      0.0      720.0   \n",
       "4                0     1001  46000.0       665.0      0.0      320.0   \n",
       "...            ...      ...      ...         ...      ...        ...   \n",
       "149995           0     1001  46000.0       760.0      0.0      360.0   \n",
       "149996           0     1001  46000.0       780.0      0.0      360.0   \n",
       "149997           0     1001  46000.0       780.0      0.0      360.0   \n",
       "149998      500925     1052  46000.0       854.0    240.0      480.0   \n",
       "149999           0     1001  46000.0       780.0      0.0      360.0   \n",
       "\n",
       "        media_id  ntt  osv  package     timestamp version    fea_hash  \\\n",
       "0            639  2.0  810      188  1.559872e+12       7  1672223856   \n",
       "1            704  6.0  810      221  1.559739e+12       3  3767901757   \n",
       "2             39  2.0  510     1562  1.559614e+12       0   454638703   \n",
       "3             54  2.0  711        9  1.559668e+12       0  1507622951   \n",
       "4             29  5.0  810        4  1.559694e+12       0  4116351093   \n",
       "...          ...  ...  ...      ...           ...     ...         ...   \n",
       "149995        29  2.0  810        4  1.559957e+12       0  3162887451   \n",
       "149996        29  2.0  900        4  1.559863e+12       0    97238959   \n",
       "149997        29  5.0  810        4  1.560041e+12       0  1320118495   \n",
       "149998       249  6.0  442        0  1.559688e+12       2  1292986591   \n",
       "149999        29  2.0  900        4  1.559950e+12       0   259614175   \n",
       "\n",
       "        location   fea1_hash  cus_type  fea_hash_len  fea1_hash_len  \n",
       "0             57  3872258917       658            10             10  \n",
       "1             23   129322164       943            10              9  \n",
       "2             30  4226678391       411             9             10  \n",
       "3             65  3355419572       848            10             10  \n",
       "4            148  2644467751       411            10             10  \n",
       "...          ...         ...       ...           ...            ...  \n",
       "149995       126  2711576615       411            10             10  \n",
       "149996       322  2678022183       411             8             10  \n",
       "149997        46  2610913319       411            10             10  \n",
       "149998        41  1898209327       430            10             10  \n",
       "149999       122  2594136103       411             9             10  \n",
       "\n",
       "[150000 rows x 18 columns]"
      ]
     },
     "execution_count": 50,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Same feature engineering as for the training frame. Take an explicit copy so\n",
    "# the column assignments below write to an independent frame rather than to a\n",
    "# slice of `test` (this avoids pandas' SettingWithCopyWarning).\n",
    "test_features = test[col].copy()\n",
    "\n",
    "for hash_col in ('fea_hash', 'fea1_hash'):\n",
    "    # String length of each hash value, kept as its own feature\n",
    "    test_features[hash_col + '_len'] = test_features[hash_col].map(lambda x: len(str(x)))\n",
    "    # Values longer than 16 characters are replaced by 0; the rest are cast to int.\n",
    "    # NOTE(review): the original author left the reason for this cut-off as an open question.\n",
    "    test_features[hash_col] = test_features[hash_col].map(lambda x: 0 if len(str(x)) > 16 else int(x))\n",
    "\n",
    "# Normalise OS version strings to integer codes\n",
    "test_features['osv'] = test_features['osv'].apply(trans_osv)\n",
    "\n",
    "test_features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0, 1, 0, ..., 1, 1, 1], dtype=int64)"
      ]
     },
     "execution_count": 52,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Train a LightGBM classifier with its default hyper-parameters\n",
    "import lightgbm as lgb\n",
    "\n",
    "model = lgb.LGBMClassifier()\n",
    "# 'timestamp' and 'version' are left out of the model inputs\n",
    "model.fit(features.drop(columns=['timestamp', 'version']), train['label'])\n",
    "\n",
    "# Predicted class label for every test row, using the same input columns\n",
    "result = model.predict(test_features.drop(columns=['timestamp', 'version']))\n",
    "result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>sid</th>\n",
       "      <th>label</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1440682</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1606824</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1774642</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1742535</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1689686</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>149995</th>\n",
       "      <td>1165373</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>149996</th>\n",
       "      <td>1444115</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>149997</th>\n",
       "      <td>1134378</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>149998</th>\n",
       "      <td>1700238</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>149999</th>\n",
       "      <td>1201539</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>150000 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "            sid  label\n",
       "0       1440682      0\n",
       "1       1606824      1\n",
       "2       1774642      0\n",
       "3       1742535      0\n",
       "4       1689686      1\n",
       "...         ...    ...\n",
       "149995  1165373      1\n",
       "149996  1444115      1\n",
       "149997  1134378      1\n",
       "149998  1700238      1\n",
       "149999  1201539      1\n",
       "\n",
       "[150000 rows x 2 columns]"
      ]
     },
     "execution_count": 53,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Build the submission: the test sid column plus the predicted label, saved as CSV\n",
    "res = test[['sid']].assign(label=result)\n",
    "res.to_csv('./baseline.csv', index=False)\n",
    "res"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
